import logging
# Timestamped INFO-level logging for the whole notebook (gensim logs through this too).
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
# Emit one record so the logging configuration is visibly active.
logging.info("check")
import glob
import os
def cmp_key(s):
    """Sort key for a checkpoint path named like 'ep<E>_<D>docs_<rest>'.

    Returns the tuple (epoch, num_docs) parsed from the basename, so paths
    sort first by epoch and then by document count.
    """
    epoch_tok, docs_tok, _ = os.path.basename(s).split("_")
    return (int(epoch_tok.replace("ep", "")),
            int(docs_tok.replace("docs", "").replace("doc", "")))
# Checkpoint paths under models/lda/ with exactly one dot in the path and
# excluding the 250000-docs snapshot, sorted by (epoch, docs).
models = sorted(filter(lambda s: s.count(".") == 1 and "250000" not in s, glob.glob("models/lda/*")), key=cmp_key)
# Materialize as a list: keyed_models is indexed (keyed_models[4]) and
# iterated again later, which a lazy Python 3 zip iterator cannot support.
keyed_models = list(zip(map(cmp_key, models), models))
keyed_models
import plotly.offline as py
import plotly.graph_objs as go
# Enable inline (offline) plotly rendering inside the notebook.
py.init_notebook_mode()
def draw_heatmap(z, title="", x_title="", y_title="", text=None):
    """Render a square (950x950 px) RdBu plotly heatmap inline.

    z       -- 2-D matrix of values to plot
    title   -- figure title
    x_title -- x-axis label
    y_title -- y-axis label
    text    -- optional per-cell hover annotations; None means no annotations
               (a None default replaces the original mutable `text=[]` default)
    """
    data = [
        go.Heatmap(
            z=z,
            colorscale='RdBu',
            # Preserve the original behavior of passing an empty list by default.
            text=[] if text is None else text
        )
    ]
    layout = go.Layout(
        width=950,
        height=950,
        title=title,
        xaxis=dict(title=x_title),
        yaxis=dict(title=y_title))
    py.iplot(go.Figure(data=data, layout=layout))
P.S. We are not limited to LDA. Required: the matrix $\Phi$ ($\text{Topics} \times \text{Dictionary}$) and a $Dictionary$ (optional, used for the Jaccard distance and annotations).
import numpy as np
from gensim.models import LdaMulticore
from gensim.matutils import kullback_leibler, hellinger
def topic2topic_diff_probs(m1, m2, distance="kulback_leibler"):
    """Pairwise distance between every topic of m1 and every topic of m2.

    m1, m2   -- trained gensim LDA models (topic-word matrices via state.get_lambda())
    distance -- "kulback_leibler" or "hellinger" (key spelling kept for
                backward compatibility with existing callers)
    Returns a (t1_size x t2_size) matrix normalized by its maximum value.

    Fix vs the original: the upper-triangle-plus-mirror fill
    (z[j][i] = z[i][j]) is only valid when m1 is m2 and the distance is
    symmetric; for two different models it produced wrong values (KL is not
    even symmetric) and indexed out of bounds when topic counts differ.
    Every cell is now computed directly.
    """
    distances = {"kulback_leibler": kullback_leibler,
                 "hellinger": hellinger}
    assert distance in distances, "Incorrect distance, valid only {}".format(", ".join(distances.keys()))
    distance_func = distances[distance]
    d1, d2 = m1.state.get_lambda(), m2.state.get_lambda()
    t1_size, t2_size = d1.shape[0], d2.shape[0]
    z = np.zeros((t1_size, t2_size))
    for topic1 in range(t1_size):
        for topic2 in range(t2_size):
            z[topic1][topic2] = distance_func(d1[topic1], d2[topic2])
    z_max = np.max(z)
    # Guard against a division by zero when all distances are zero.
    return z / z_max if z_max > 0 else z
from gensim.models import LdaMulticore
import numpy as np
def jaccard(s1, s2):
    """Jaccard distance between two sets, in [0, 1].

    Returns 0.0 when both sets are empty (identical), instead of raising
    ZeroDivisionError as the original did.
    """
    union = s1 | s2
    if not union:
        return 0.0
    return 1. - float(len(s1 & s2)) / float(len(union))
def topic2topic_diff_jcd(m1, m2, num_words=100):
    """Jaccard distance between the top-word sets of every topic pair.

    m1, m2    -- trained gensim LDA models
    num_words -- how many top words represent each topic
    Returns a (t1_size x t2_size) matrix of distances in [0, 1].

    Fix vs the original: mirroring z[j][i] = z[i][j] is only correct when
    m1 is m2; for two different models it stored the wrong value (and could
    index out of bounds when the topic counts differ). Every cell is now
    computed directly.
    """
    t1_size = m1.state.get_lambda().shape[0]
    t2_size = m2.state.get_lambda().shape[0]
    fst_topics = [{w for (w, _) in m1.show_topic(t, topn=num_words)} for t in range(t1_size)]
    snd_topics = [{w for (w, _) in m2.show_topic(t, topn=num_words)} for t in range(t2_size)]
    z = np.zeros((t1_size, t2_size))
    for topic1 in range(t1_size):
        for topic2 in range(t2_size):
            z[topic1][topic2] = jaccard(fst_topics[topic1], snd_topics[topic2])
    return z
from gensim.models import LdaMulticore
from random import sample
import numpy as np
def topic2topic_text(m1, m2, num_words=100, topw=10):
    """Build hover annotations for the topic-vs-topic heatmaps.

    Cell (i, j) lists up to `topw` randomly sampled words shared by topic i
    of m1 and topic j of m2 ("+++") and up to `topw` words that occur in
    only one of them ("---"), formatted with an HTML line break for plotly.

    Fixes vs the original:
    * the grid was allocated as t2_size x t1_size but indexed
      [topic1][topic2], breaking when the models have different topic counts;
    * mirroring txt[j][i] = txt[i][j] is only correct when m1 is m2 --
      every cell is now computed directly;
    * random.sample() rejects sets since Python 3.11 (TypeError), so the
      token sets are sorted into sequences before sampling.
    """
    t1_size = m1.state.get_lambda().shape[0]
    t2_size = m2.state.get_lambda().shape[0]
    fst_topics = [{w for (w, _) in m1.show_topic(t, topn=num_words)} for t in range(t1_size)]
    snd_topics = [{w for (w, _) in m2.show_topic(t, topn=num_words)} for t in range(t2_size)]
    txt = [["" for _ in range(t2_size)] for _ in range(t1_size)]
    for topic1 in range(t1_size):
        for topic2 in range(t2_size):
            shared = fst_topics[topic1] & snd_topics[topic2]
            distinct = fst_topics[topic1].symmetric_difference(snd_topics[topic2])
            pos_tokens = sample(sorted(shared), min(len(shared), topw))
            neg_tokens = sample(sorted(distinct), min(len(distinct), topw))
            txt[topic1][topic2] = "+++ {}<br>--- {}".format(
                ", ".join(pos_tokens), ", ".join(neg_tokens))
    return txt
# Pick two checkpoint paths to compare (indices into the sorted keyed_models list).
p1, p2 = keyed_models[4][1], keyed_models[12][1]
m1, m2 = LdaMulticore.load(p1), LdaMulticore.load(p2)
# Topic-vs-topic difference matrices under three metrics (IPython %time magics).
%time z_jcd = topic2topic_diff_jcd(m1, m2)
%time z_kl = topic2topic_diff_probs(m1, m2, distance="kulback_leibler")
%time z_hellinger = topic2topic_diff_probs(m1, m2, distance="hellinger")
# Shared hover annotations listing common/distinct top words per topic pair.
%time annotation_text = topic2topic_text(m1, m2)
draw_heatmap(z_jcd, title="Topic difference [Jaccard]", x_title="Topic", y_title="Topic", text=annotation_text)
draw_heatmap(z_kl,title="Topic difference [KL]", x_title="Topic", y_title="Topic", text=annotation_text)
draw_heatmap(z_hellinger, title="Topic difference [Hellinger]", x_title="Topic", y_title="Topic", text=annotation_text)
def smodel_diff_kl(models_pths, distance="kulback_leibler", num_words=100):
    """Distance between same-numbered topics of consecutive checkpoints.

    models_pths -- checkpoint paths ordered by training progress
    distance    -- "kulback_leibler" or "hellinger" (key spelling kept for
                   backward compatibility with existing callers)
    num_words   -- unused; kept for interface parity with smodel_diff_jcd
    Returns a (num_topics x num_steps) array, normalized by its maximum:
    rows are topics, columns are consecutive checkpoint pairs.
    """
    distances = {"kulback_leibler": kullback_leibler,
                 "hellinger": hellinger}
    assert distance in distances, "Incorrect distance, valid only {}".format(", ".join(distances.keys()))
    distance_func = distances[distance]
    z = []
    for (p1, p2) in zip(models_pths, models_pths[1:]):
        m1, m2 = LdaMulticore.load(p1).state.get_lambda(), LdaMulticore.load(p2).state.get_lambda()
        assert m1.shape[0] == m2.shape[0]
        z.append([distance_func(m1[topic], m2[topic]) for topic in range(m1.shape[0])])
    # Explicit array conversion; also avoids np.max crashing on an empty
    # list when fewer than two checkpoint paths are supplied.
    z = np.asarray(z)
    if z.size == 0:
        return z.T
    return (z / np.max(z)).T
def smodel_diff_jcd(models_pths, num_words=100):
    """Jaccard distance between same-numbered topics of consecutive checkpoints.

    models_pths -- checkpoint paths ordered by training progress
    num_words   -- how many top words represent each topic
    Returns a (num_topics x num_steps) numpy array: rows are topics,
    columns are consecutive checkpoint pairs.
    """
    columns = []
    for prev_path, next_path in zip(models_pths, models_pths[1:]):
        prev_model = LdaMulticore.load(prev_path)
        next_model = LdaMulticore.load(next_path)
        num_topics = prev_model.state.get_lambda().shape[0]
        assert num_topics == next_model.state.get_lambda().shape[0]
        diffs = []
        for t in range(num_topics):
            prev_words = {w for (w, _) in prev_model.show_topic(t, topn=num_words)}
            next_words = {w for (w, _) in next_model.show_topic(t, topn=num_words)}
            diffs.append(jaccard(prev_words, next_words))
        columns.append(diffs)
    return np.array(columns).T
def topic_cov(z_diffs):
    """Mean normalized per-step topic difference (one value per column).

    z_diffs -- 2-D array of topic differences (topics x steps)
    Returns a 1-D array: for each column, the mean over rows after dividing
    the whole matrix by its maximum.

    Fix vs the original: `z_diffs /= np.max(z_diffs)` modified the caller's
    array in place; the normalization is now done on a copy.
    """
    z = np.asarray(z_diffs, dtype=float)
    z_max = np.max(z)
    if z_max > 0:
        z = z / z_max
    return np.mean(z, axis=0)
# All checkpoint paths in (epoch, docs) order.
pths = [_[1] for _ in keyed_models]
# Topic drift between consecutive checkpoints under three metrics.
%time z_diffs_jcd = smodel_diff_jcd(pths)
%time z_diffs_kl = smodel_diff_kl(pths, distance="kulback_leibler")
%time z_diffs_hr = smodel_diff_kl(pths, distance="hellinger")
draw_heatmap(z_diffs_jcd,
             title="Topic diff between updates [Jaccard]", x_title="Epoch diff", y_title="Topics")
draw_heatmap(z_diffs_kl,
             title="Topic diff between updates [KL]", x_title="Epoch diff", y_title="Topics")
draw_heatmap(z_diffs_hr,
             title="Topic diff between updates [Hellinger]", x_title="Epoch diff", y_title="Topics")
import json
from gensim.models.coherencemodel import CoherenceModel
def calc_perplexity(model_pths, holdout_path):
    """Perplexity of each saved model on a held-out corpus.

    model_pths   -- paths to saved gensim LDA checkpoints
    holdout_path -- JSON-lines file; each line's "d2b" field is a BOW document
    Returns a list of floats, one per model, computed as
    2 ** (-log_perplexity).
    """
    with open(holdout_path) as infile:
        holdout = [json.loads(line)["d2b"] for line in infile]
    scores = []
    total = len(model_pths)
    for num, path in enumerate(model_pths, start=1):
        logging.info("Model %d of %d", num, total)
        model = LdaMulticore.load(path)
        scores.append(float(np.exp2(-model.log_perplexity(holdout))))
    return scores
def calc_coherence(model_pths, holdout_path):
    """u_mass topic coherence of each saved model on a held-out corpus.

    model_pths   -- paths to saved gensim LDA checkpoints
    holdout_path -- JSON-lines file; each line's "d2b" field is a BOW document
    Returns a list of floats, one per model.
    """
    with open(holdout_path) as infile:
        holdout = [json.loads(line)["d2b"] for line in infile]
    scores = []
    total = len(model_pths)
    for num, path in enumerate(model_pths, start=1):
        logging.info("Model %d of %d", num, total)
        model = LdaMulticore.load(path)
        cm = CoherenceModel(model=model, corpus=holdout, coherence='u_mass')
        scores.append(float(cm.get_coherence()))
    return scores
import os
!mkdir -p cache/
# Perplexity is expensive to compute: reuse the cached file if it exists.
if not os.path.isfile("cache/perplexity.txt"):
    %time perplexity = calc_perplexity(pths, "dataset/holdout.json")
    with open("cache/perplexity.txt", 'w') as outfile:
        for perpl in perplexity:
            outfile.write("{}\n".format(perpl))
# Always reload from the cache file, one float per line.
with open("cache/perplexity.txt") as infile:
    perplexity = [float(line.strip()) for line in infile]
!mkdir -p cache/
# Same caching scheme for coherence scores.
if not os.path.isfile("cache/coherence.txt"):
    %time coherence = calc_coherence(pths, "dataset/holdout.json")
    with open("cache/coherence.txt", 'w') as outfile:
        for coh in coherence:
            outfile.write("{}\n".format(coh))
with open("cache/coherence.txt") as infile:
    coherence = [float(line.strip()) for line in infile]
# Smooth each metric with a 2-point moving average over consecutive checkpoints.
# Use list comprehensions instead of map(): under Python 3 map() returns a lazy
# single-use iterator, which plotly cannot reliably consume as a y-value sequence.
perplexity_pr = [np.mean(pair) for pair in zip(perplexity, perplexity[1:])]
coherence_pr = [np.mean(pair) for pair in zip(coherence, coherence[1:])]
# Convergence curves: per-step mean topic difference under each metric.
data = [go.Scatter(y=topic_cov(z_diffs_jcd), name="sum(jaccard)"),
        go.Scatter(y=topic_cov(z_diffs_kl), name="sum(KL)"),
        go.Scatter(y=topic_cov(z_diffs_hr), name="sum(Hellinger)")]
layout = go.Layout(
    title="Topic convergence",
    xaxis=dict(title="Epoch diff")
)
py.iplot(go.Figure(data=data, layout=layout))
# Smoothed perplexity across checkpoints.
py.iplot(go.Figure(data=[go.Scatter(y=perplexity_pr,
                                    name="log(perplexity)")],
                   layout=go.Layout(title="Perplexity")))
# Smoothed u_mass coherence across checkpoints.
py.iplot(go.Figure(data=[go.Scatter(y=coherence_pr,
                                    name="Coherence u_mass")],
                   layout=go.Layout(title="Topic coherence")))